#!/usr/bin/env python3
'''
Author: Paschalis Agapitos
Project: Mestizajes

Provides principal component analysis utilities for century-level embedding
representations, including explained-variance reporting and scree plot
generation. The module supports standardized preprocessing, reusable plotting,
and optional file export to help evaluate embedding structure across fields of
human activity.
'''

from pathlib import Path
from typing import Dict, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


class PCAReporter:
    """Fit a PCA on a standardized matrix and report it as scree plots.

    Parameters
    ----------
    random_state : int
        Seed forwarded to ``sklearn.decomposition.PCA``.
    """

    def __init__(self, random_state: int = 42):
        self.random_state = random_state

    def scree_plot(
        self,
        X: np.ndarray,
        out_path: Path,
        show: bool = True,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Generate PCA scree plot showing explained variance.

        The columns of ``X`` are standardized (centered and scaled) before
        the PCA is fitted. The figure has two panels: the proportion of
        variance explained (PVE) per component, and its cumulative sum.

        Parameters
        ----------
        X : np.ndarray
            Input matrix (n_samples, n_features)
        out_path : Path
            Path to save the plot; missing parent directories are created
        show : bool
            Whether to display the plot after saving it

        Returns
        -------
        pve : np.ndarray
            Explained variance ratio per component
        pve_cum : np.ndarray
            Cumulative explained variance ratio
        """
        scaler = StandardScaler(with_mean=True, with_std=True)
        Xs = scaler.fit_transform(X)

        pca = PCA(random_state=self.random_state)
        pca.fit(Xs)

        pve = pca.explained_variance_ratio_
        pve_cum = pve.cumsum()

        out_path = Path(out_path)
        out_path.parent.mkdir(parents=True, exist_ok=True)

        # Components are numbered from 1 on the x axis.
        ticks = np.arange(len(pve)) + 1

        # Apply the ggplot style only while this figure is built and
        # rendered, so the caller's global matplotlib style is not altered.
        with plt.style.context("ggplot"):
            fig, axes = plt.subplots(1, 2, figsize=(15, 6))
            try:
                axes[0].plot(ticks, pve, marker="o")
                axes[0].set_xlabel("Principal Component")
                axes[0].set_ylabel("PVE")
                axes[0].set_title("Scree plot")

                axes[1].plot(ticks, pve_cum, marker="o")
                axes[1].set_xlabel("Principal Component")
                axes[1].set_ylabel("Cumulative PVE")
                axes[1].set_title("Cumulative Scree plot")

                # Save through the figure handle rather than pyplot's
                # implicit "current figure".
                fig.savefig(out_path, dpi=500)
                if show:
                    plt.show()
            finally:
                # Release the figure even if saving or showing fails.
                plt.close(fig)

        return pve, pve_cum


def generate_pca_scree_plot(
    century_embeddings: Dict[int, np.ndarray],
    field_of_activity: str,
    scree_plot_path: Optional[Path] = None,
    show: bool = True
) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
    """
    Generate PCA scree plot for century embeddings.

    The embedding arrays are stacked row-wise into a single matrix and
    handed to ``PCAReporter.scree_plot``.

    Parameters
    ----------
    century_embeddings : Dict[int, np.ndarray]
        Dictionary of century embeddings
    field_of_activity : str
        Name of the field; only used to build the default file name when
        ``scree_plot_path`` is not given
    scree_plot_path : Optional[Path]
        Optional path to save the plot; when omitted, a default path derived
        from ``field_of_activity`` is used
    show : bool
        Whether to display the plot

    Returns
    -------
    pve : np.ndarray or None
        Explained variance ratio per component
    pve_cum : np.ndarray or None
        Cumulative explained variance ratio

    Both values are ``None`` when ``century_embeddings`` is empty.
    """
    # Nothing to analyse: keep the (None, None) result callers already get.
    if not century_embeddings:
        return None, None

    X_cent = np.vstack(list(century_embeddings.values()))

    if scree_plot_path is None:
        # Generate default path based on field of activity
        field_clean = field_of_activity.replace(" ", "_").replace("&", "and").lower()
        # NOTE(review): this path is relative to the current working
        # directory and repeats "figures/embeddings_analysis" twice, which
        # looks accidental. Left unchanged so existing outputs do not move;
        # confirm the intended location.
        scree_plot_path = Path(f"../../../figures/embeddings_analysis/figures/embeddings_analysis/scree_plot_{field_clean}_cent_embeddings_tfidf.png")

    # scree_plot creates the parent directory of the target path itself.
    return PCAReporter().scree_plot(X_cent, scree_plot_path, show=show)
